Archivos y Bases de datos

La idea de este taller es manipular archivos (leerlos, parsearlos y escribirlos) y hacer lo mismo con bases de datos estructuradas.

Ejercicio 1

Baje el archivo de "All associations with added ontology annotations" del GWAS Catalog.

https://www.ebi.ac.uk/gwas/docs/file-downloads

Describa las columnas del archivo (que información estamos mirando? Para qué sirve? Por qué la hicieron?)



In [1]:

    
import pandas as pd
df = pd.read_csv('datoss.tsv', sep='\t')









    



C:\Users\Margarita\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py:2717: DtypeWarning: Columns (12,23,27) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)



In [ ]:

    
df [1:1]

La base de datos se compone de 37 columnas (Variables). En términos generales los datos exponen la asociación genética de un tipo de patología (eg. Cáncer, Asma) y la información "detallada" sobre el estudio que determinó dicha relación.

Esta base de datos permite almacenar de forma ordenada la relación entre rasgos genéticos y fenotípicos. Los estudios contenidos en esta base de datos tienen como intención descubrir la clave para prevenir, diagnosticar y tratar una enfermedad. Esto, también es conocido como estudio de asociación de genoma completo.

Qué Entidades (tablas) puede definir?

Journal
Platform
Study
Publicacion
Enfermedad
Loci
Enfermedad_Loci (Tabala intermedia)

Cree la base de datos (copie el código SQL que se usó)



In [8]:

    
CREATE TABLE journal
(
id_JOURNAL int auto_increment PRIMARY KEY,
namejournal varchar (300)
);

CREATE TABLE platform 
(
id_platform INT auto_increment NOT NULL PRIMARY KEY,
PLATFORMname VARCHAR(300)
);

CREATE TABLE study
(
id_STUDY int auto_increment PRIMARY KEY,
STUDY text,
INITIAL_SAMPLE_SIZE	int,
REPLICATION_SAMPLE_SIZE int,
id_platform int,
foreign key (id_platform) references platform(id_platform),
P_VALUE int,
PVALUE_MLOG int,
PVALUE_TEXT varchar (300),
CI_text varchar (300),
OR_BETA int,
MAPPED_TRAIT varchar (300),
MAPPED_TRAIT_URI varchar (300)
);

CREATE TABLE publicacion
(
id_publicacion int auto_increment PRIMARY KEY,
PUBMEDID varchar (300),
FIRSTAUTHOR varchar (300),
id_journal int,
foreign key (id_JOURNAL) references journal(id_JOURNAL),
LINK varchar (300),
STUDY_ACCESSION varchar (300),
id_STUDY int,
foreign key (id_STUDY) references study(id_STUDY)
);

CREATE TABLE enfermedad 
(
id_enfermedad int auto_increment PRIMARY KEY,
DISEASETRAITenfermedad VARCHAR(300)
);

CREATE TABLE loci
(
id_loci int auto_increment PRIMARY KEY,
REGION text,
CHR_ID text,
CHR_POS text,
REPORTED_GENE text,
MAPPED_GENE text,
UPSTREAM_GENE_ID text,
DOWNSTREAM_GENE_ID text,
SNP_GENE_IDS text,
UPSTREAM_GENE_DISTANCE	text,
DOWNSTREAM_GENE_DISTANCE text,
STRONGEST_SNP_RISK_ALLELE text,
SNPS text,
MERGED text,
SNP_ID_CURRENT text,
CONTEXT_ text,
INTERGENIC text
);


CREATE TABLE enfermedad_loci
(
id_enfermedad int,
id_loci int,
PRIMARY KEY (id_enfermedad, id_loci),
foreign key (id_enfermedad) references enfermedad(id_enfermedad),
foreign key (id_loci) references loci(id_loci)
);

Ejercicio 2

Lea el archivo y guarde la infomación en la base de datos en las tablas que se definidieron en el Ejercicio 1.



In [ ]:

    
#Leer el archivo
df.head(1)



In [3]:

    
import mysql.connector
cnx = mysql.connector.connect(user='root', password='fnsQFJ14',
                              host='127.0.0.1', database='new_schema')



In [ ]:

    
hostname = '127.0.0.1'
username = 'root'
password = 'fnsQFJ14'
database = 'new_schema'

def doQuery( conn ) :
    cur = conn.cursor()

    cur.execute( "select * from platform" )

    for id_nombre, nombre_plat in cur.fetchall() :
        print (id_nombre, nombre_plat)

myConnection = mysql.connector.connect( host=hostname, user=username, passwd=password, db=database )
doQuery( myConnection )
myConnection.close()



In [6]:

    
# Guardar informacion de plataforma
def get_platformId(names_platf):
    cur = myConnection.cursor()

    cur.execute( """select * from platform where PLATFORM_SNPS_PASSplatformING_QC = "%s" """ % (names_platf) )
    
    id_plat = None
    for id_, nombre_plat in cur.fetchall() :
        id_plat = id_
        
    if not id_plat:
        print("""insert into platform values (NULL, "%s" )""" % (names_platf))
        cur.execute("""insert into platform values (NULL, "%s" )""" % (names_platf))
        cur.execute("SELECT LAST_INSERT_ID()")
        
        id_plat = cur.fetchall()[0][0]
        
        myConnection.commit()
        
    return id_plat

hostname = '127.0.0.1'
username = 'root'
password = 'fnsQFJ14'
database = 'new_schema'

myConnection = mysql.connector.connect( host=hostname, user=username, passwd=password, db=database )

for index, row in df.iterrows():
    plat_name = row['PLATFORM [SNPS PASSING QC]']
    platform_id = get_platformId(plat_name)

print()

myConnection.close()



In [9]:

    
# Guardar informacion de journal
def get_journalId(names_journal):
    cur = myConnection.cursor()

    cur.execute( """select * from journal where namejournal = "%s" """ % (names_journal) )
    
    id_jour = None
    for id_, nombre_journ in cur.fetchall() :
        id_jour = id_
        
    if not id_jour:
        print("""insert into journal values (NULL, "%s" )""" % (names_journal))
        cur.execute("""insert into journal values (NULL, "%s" )""" % (names_journal))
        cur.execute("SELECT LAST_INSERT_ID()")
        
        id_jour = cur.fetchall()[0][0]
        
        myConnection.commit()
        
    return id_jour

hostname = '127.0.0.1'
username = 'root'
password = 'fnsQFJ14'
database = 'new_schema'

myConnection = mysql.connector.connect( host=hostname, user=username, passwd=password, db=database )

for index, row in df.iterrows():
    journal_name = row['JOURNAL']
    journal_id = get_journalId(journal_name)

print()

myConnection.close()



In [12]:

    
# Guardar información de enferemdad

def get_enfermedadlId(names_enferm):
    cur = myConnection.cursor()

    cur.execute( """select * from enfermedad where DISEASETRAIT = "%s" """ % (names_enferm) )
    
    id_enfer = None
    for id_, id_platform in cur.fetchall() :
        id_enfer = id_
        
    if not id_enfer:
        print("""insert into enfermedad values (NULL, "%s" )""" % (names_enferm))
        cur.execute("""insert into enfermedad values (NULL, "%s" )""" % (names_enferm))
        cur.execute("SELECT LAST_INSERT_ID()")
        
        id_enfer = cur.fetchall()[0][0]
        
        myConnection.commit()
        
    return id_enfer

hostname = '127.0.0.1'
username = 'root'
password = 'fnsQFJ14'
database = 'new_schema'

myConnection = mysql.connector.connect( host=hostname, user=username, passwd=password, db=database )

for index, row in df.iterrows():
    enf_name = row['DISEASE/TRAIT']
    enferm_id = get_enfermedadlId(enf_name)

print()

myConnection.close()



In [ ]:

    
#Guardar información loci
def get_lociId(regionloci, chro, chrpos, repor, mappede, geneups, genedows, snp, upstr, downs, riskalle, snps, merged,
                      snpid, contexts, intergenic):
    cur = myConnection.cursor()

    cur.execute( """select id_loci, CHR_ID, CHR_POS from loci where CHR_ID = "%s" and CHR_POS = "%s"  
                """ %(chro, chrpos))
    id_loci = None
    for id_, chrm, pos  in cur.fetchall() :
        print(id_)
        id_loci = id_
    
    if not id_loci:
        #print("""insert into loci values (NULL, "%s", "%s", "%s", "%s", "%s","%s", "%s", "%s", "%s", "%s", "%s","%s", 
        #"%s", "%s", "%s", "%s")""" % (regionloci, chro, chrpos, repor, mappede, geneups, genedows, snp, upstr, downs, 
        #riskalle, snps, merged, snpid, contexts, intergenic))
        cur.execute("""insert into loci values (NULL, "%s", "%s", "%s", "%s", "%s","%s", "%s", "%s", "%s", "%s", "%s","%s", 
        "%s", "%s", "%s", "%s")""" % (regionloci, chro, chrpos, repor, mappede, geneups, genedows, snp, upstr, downs, 
        riskalle, snps, merged, snpid, contexts, intergenic))
        cur.execute("SELECT LAST_INSERT_ID()")
        
        id_loci = cur.fetchall()[0][0]
        
        myConnection.commit()
        
    return id_loci

hostname = '127.0.0.1'
username = 'root'
password = 'fnsQFJ14'
database = 'new_schema'

myConnection = mysql.connector.connect( host=hostname, user=username, passwd=password, db=database )


for index, row in df.iterrows():
    #print(type(loci[0]), type(loci[1]), type(loci[2]), type(loci[3]), type(loci[4]), type(loci[5]), type(loci[6]), 
                #type(loci[7]),type(loci[8]), type(loci[9]), type(loci[10]), type(loci[11]), type(loci[12]),
                #type(loci[13]), type(loci[14]),type(loci[15]))
    loci = [row['REGION'], row['CHR_ID'], row['CHR_POS'],row['REPORTED GENE(S)'],row['MAPPED_GENE'],row['UPSTREAM_GENE_ID'], 
            row['DOWNSTREAM_GENE_ID'], row['SNP_GENE_IDS'],row['UPSTREAM_GENE_DISTANCE'],row['DOWNSTREAM_GENE_DISTANCE'],
            row['STRONGEST SNP-RISK ALLELE'],row['SNPS'], row['MERGED'], row['SNP_ID_CURRENT'], row['CONTEXT'], 
            row['INTERGENIC']]
    loci_id = get_lociId(loci[0], loci[1], loci[2], loci[3], loci[4], loci[5], loci[6], loci[7], loci[8], loci[9], loci[10],
                         loci[11], loci[12], loci[13], loci[14], loci[15])
    
    
print()

myConnection.close()



In [ ]:

    
#Tabla intermedia: Enfermedad_loci
hostname = '127.0.0.1'
username = 'root'
password = 'fnsQFJ14'
database = 'new_schema'

myConnection = mysql.connector.connect( host=hostname, user=username, passwd=password, db=database )   

cur = myConnection.cursor()
for index, row in df.iterrows():
    enfermedadAux = row['DISEASE/TRAIT']
    
    cur.execute("""select id_enfermedad from enfermedad where DISEASETRAIT = "%s" """ % (enfermedadAux) )
    idenferAux=cur.fetchall()
    cur.execute("""select id_loci from loci where CHR_ID = "%s" and CHR_POS = "%s" """ % (row.CHR_ID, row.CHR_POS))
    idlociAux=cur.fetchall()
    #print(idenferAux[0][0],idlociAux[0][0])
    cur.execute("""select * from enfermedad_loci where id_loci = "%d" and id_enfermedad = "%d" """ %(idlociAux[0][0], 
                                                                                                   idenferAux[0][0]))
    idiguales = cur.fetchall()
    #print(idiguales)
    
    if not idiguales:
        cur.execute("""insert into enfermedad_loci (id_enfermedad, id_loci) values ("%d", "%d")""" %(idenferAux[0][0], 
                          idlociAux[0][0]))
        myConnection.commit()
        
myConnection.close()

Ejercicio 3

Realice de la base de datos una consulta que le responda una pregunta biológica (e.g. qué genes estan relacionados con cuales enfermedades)



In [ ]:

    
#¿Cuáles genes se encuentran relacionados con el cáncer de pulmón?
#Se responde a la anterior pregunta y se expone el cromosoma en el cuál se encuentra dicho gen.

hostname = '127.0.0.1'
username = 'root'
password = 'fnsQFJ14'
database = 'new_schema'

def doQuery( conn ) :
    cur = conn.cursor()

myConnection = mysql.connector.connect( host=hostname, user=username, passwd=password, db=database )
doQuery( myConnection )
#myConnection.close()
cur = myConnection.cursor()


cur.execute("""SELECT r.CHR_ID, r.REPORTED_GENE
FROM loci r
WHERE NOT EXISTS (SELECT * FROM enfermedad i
                  WHERE DISEASETRAIT IN ('Lung cancer') 
                  AND NOT EXISTS
                      (SELECT * FROM enfermedad_loci ri
                       WHERE ri.id_loci = r.id_loci
                       AND ri.id_enfermedad = i.id_enfermedad))
""")
enferme = cur.fetchall()
print(enferme)

Ejercicio 4

Guarde el resultado de la consulta anterior en un archivo csv



In [7]:

    
import pandas as pd
my_df = pd.DataFrame(enferme)
my_df.to_csv('output.csv', index=False, header=False)